from collections import Counter
import math

#print(math.log(10,10))
# Character-level TF-IDF demo.
#
# Each input string is treated as one document and each character as one
# term. The script prints, in order: the two vocabularies, the term
# frequencies of both documents, the inverse document frequencies and the
# final TF-IDF weights.


def _term_frequencies(tokens, vocabulary):
    """Return ``{term: count / len(tokens)}`` for every term in *vocabulary*.

    Occurrences are tallied in a single pass with ``Counter`` instead of
    rescanning *tokens* once per term. The result preserves the iteration
    order of *vocabulary*. An empty *vocabulary* yields an empty dict, so an
    empty document never triggers a division by zero.
    """
    occurrences = Counter(tokens)
    total = len(tokens)
    return {term: occurrences[term] / total for term in vocabulary}


a = 'asdqwesasasdq'
b = 'efqweqsawqwsdawq'

# Token lists (one token per character) and their vocabularies.
la = list(a)
lb = list(b)
seta = set(la)
setb = set(lb)
print(seta)
print(setb)

# --- TF: relative frequency of each character within its own string ---

# Term frequencies for string a.
ca = _term_frequencies(la, seta)
print('字符串1：')
for item in seta:
    print(item, '的tf为：', ca[item])

# Term frequencies for string b.
cb = _term_frequencies(lb, setb)
print('字符串2：')
for item in setb:
    print(item, '的tf为：', cb[item])

# --- IDF: log10(number of documents / documents containing the term) ---

# A vocabulary holds each term at most once, so the number of times a term
# appears in the concatenated vocabularies is its document frequency.
vocabularies = (seta, setb)
n_docs = len(vocabularies)  # corpus size, derived rather than hard-coded
c = [term for vocabulary in vocabularies for term in vocabulary]

cc = set(c)
print(cc)
doc_counts = Counter(c)
cf = {item: doc_counts[item] for item in cc}
print(cf)
idf = {}
for item in cf:
    # Compute the logarithm once and reuse it for both storage and output.
    idf[item] = math.log(n_docs / cf[item], 10)
    print(item, ' 的idf为：', idf[item])

# --- TF-IDF: term frequency weighted by inverse document frequency ---

tfidfa = {item: ca[item] * idf[item] for item in seta}
for item in seta:
    print(item, '在字符串1中的tfidf为:', tfidfa[item])

tfidfb = {item: cb[item] * idf[item] for item in setb}
for item in setb:
    print(item, '在字符串2中的tfidf为:', tfidfb[item])













